%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.append("/Users/efim/PycharmProjects/")
sys.path.append("/Users/efim/PycharmProjects/SimpleAlgoTrade/model")
from SimpleAlgoTrade.model import FeatureEngineering as fe
from SimpleAlgoTrade.model.utils import ic_metric
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.inspection import PartialDependenceDisplay
from sklearn.metrics import mean_squared_error
from statsmodels.api import OLS, add_constant
import lightgbm as lgb
from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin
from hyperopt.pyll import scope
import hyperopt
import tqdm
from typing import Dict
from plotly import express as px
from plotly import graph_objects as go
from plotly import offline as pyo
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
matplotlib.rcParams['figure.figsize'] = (15, 10)
SEED = 123456789
np.random.seed(SEED)
# Move into the raw-data directory; silently stay put when the path does not
# exist (e.g. the notebook is re-run from inside that directory already).
# (Indentation restored -- the notebook export had flattened the try suite.)
try:
    os.chdir("../DataBase/files")
except FileNotFoundError:
    pass
!ls -a
. .. .DS_Store Archive price_execution_data_list_16-08-2022-13-24-21.json price_execution_data_list_16-08-2022-15-32-04.json trade_book_data_16-08-2022-13-24-21.json trade_book_data_16-08-2022-15-32-04.json
# Filename prefix shared by all tick-price JSON dumps in the data directory.
file_name = "price_execution_data_list"
def read_data_files(name_like: str) -> Dict[str, pd.DataFrame]:
    """Load every JSON file in the current directory whose name contains ``name_like``.

    Parameters
    ----------
    name_like : substring used to match file names in ``os.listdir()``.

    Returns
    -------
    dict mapping each matching file name to its parsed ``pd.DataFrame``.
    """
    data_dict: Dict[str, pd.DataFrame] = {}
    # Local loop name so the module-level ``file_name`` constant is not shadowed.
    for fname in os.listdir():
        if name_like in fname:
            data_dict[fname] = pd.read_json(fname)
    return data_dict
def aggregate_dict_to_dataframe(dictinary: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Stack per-file DataFrames into one frame indexed by the ``time`` column.

    Parameters
    ----------
    dictinary : mapping of file name -> DataFrame (parameter name kept for
        backward compatibility with existing keyword callers).

    Returns
    -------
    a single concatenated DataFrame with ``time`` as its index.
    """
    # Robustness: an empty input produces an empty frame instead of raising.
    if not dictinary:
        return pd.DataFrame()
    # Concatenate once -- growing a frame with pd.concat inside a loop copies
    # the accumulated data every iteration (quadratic).
    df = pd.concat(list(dictinary.values()), axis=0)
    df.set_index("time", inplace=True)
    return df
# Load every matching JSON dump and stack them into one time-indexed frame.
price_execution_data = read_data_files(file_name)
price_execution_data_df = aggregate_dict_to_dataframe(price_execution_data)
# Free the per-file dict; only the combined frame is used from here on.
del price_execution_data
# Files may cover overlapping/shuffled periods -- enforce chronological order.
price_execution_data_df = price_execution_data_df.sort_index()
price_execution_data_df.head()
| symbol | price | delta_time | |
|---|---|---|---|
| time | |||
| 1.660649e+09 | BTCUSDT | 24071.72 | 1.332418 |
| 1.660649e+09 | BTCUSDT | 24071.40 | 0.611654 |
| 1.660649e+09 | BTCUSDT | 24072.47 | 1.330175 |
| 1.660649e+09 | BTCUSDT | 24073.85 | 0.592218 |
| 1.660649e+09 | BTCUSDT | 24072.48 | 0.634877 |
time_lag = 13  # prediction horizon in ticks; if 1, the very next tick is used as y
# The 1-tick return would coincide with the target when time_lag == 1, so it is
# only added as a feature for longer horizons.
# (Indentation of the if-suite restored -- flattened by the notebook export.)
# NOTE(review): feature_union below references this name unconditionally, so a
# run with time_lag == 1 would raise NameError -- confirm intended usage.
if time_lag != 1:
    feature_shift_return_most_recent = make_pipeline(fe.FeatureSelector('price'), fe.CalcShift(1),fe.CalcReturn())
# All features below are derived from 'price' alone, over windows that are
# multiples of the 13-tick horizon (13, 26, ..., 130).
# NOTE(review): fe.make_union is used here although sklearn's make_union is
# imported at the top -- presumably fe re-exports a pandas-aware union that
# preserves column names; confirm against FeatureEngineering.
feature_shift_returns = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcShift(i * time_lag),fe.CalcReturn()) for i in range(1,11)))
# lagged returns at 10 horizons
feature_ma = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcMa(i * time_lag), fe.CalcReturn()) for i in range(1,11)))
# moving-average based returns
feature_bb_low = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcBB(i * time_lag,"low"), fe.CalcReturn()) for i in range(1,11)))
# lower Bollinger-band distance as a return
feature_bb_high = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcBB(i * time_lag,"high"), fe.CalcReturn()) for i in range(1,11)))
# upper Bollinger-band distance as a return
feature_quantile_001 = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcReturn(), fe.CalcQuantile(i * time_lag,0.01)) for i in range(1,11)))
# rolling 1% quantile of returns (downside tail)
feature_quantile_099 = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcReturn(), fe.CalcQuantile(i * time_lag,0.99)) for i in range(1,11)))
# rolling 99% quantile of returns (upside tail)
feature_std= fe.make_union(*(make_pipeline(fe.FeatureSelector('price'),fe.CalcReturn(), fe.CalcStd(i * time_lag)) for i in range(1,11)))
# rolling return volatility
feature_rsi = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'),fe.CalcRsi(i*time_lag)) for i in range(1,11)))
# relative strength index at 10 window sizes
feature_ppo = make_pipeline(fe.FeatureSelector('price'), fe.CalcPpo())
# percentage price oscillator (single default window)
feature_macd = make_pipeline(fe.FeatureSelector('price'), fe.CalcMacd())
# MACD (single default window)
# Target: the return realised time_lag ticks ahead (negative shift = future).
target = make_pipeline(fe.FeatureSelector('price'), fe.CalcShift(-time_lag),fe.CalcReturn())
# The target is appended LAST -- downstream code relies on this ordering.
feature_union = fe.make_union(feature_shift_return_most_recent,
feature_shift_returns,
feature_ma,
feature_bb_low,
feature_bb_high,
feature_quantile_001,
feature_quantile_099,
feature_std,
feature_rsi,
feature_ppo,
feature_macd,
target)
feature_union
PandasFeatureUnion(transformer_list=[('pipeline-1',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcshift',
CalcShift(shift_val=1)),
('calcreturn',
CalcReturn())])),
('pandasfeatureunion-1',
PandasFeatureUnion(transformer_list=[('pipeline-1',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calc...
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcppo', CalcPpo())])),
('pipeline-3',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcmacd',
CalcMacd())])),
('pipeline-4',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcshift',
CalcShift(shift_val=-13)),
('calcreturn',
CalcReturn())]))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. PandasFeatureUnion(transformer_list=[('pipeline-1',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcshift',
CalcShift(shift_val=1)),
('calcreturn',
CalcReturn())])),
('pandasfeatureunion-1',
PandasFeatureUnion(transformer_list=[('pipeline-1',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calc...
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcppo', CalcPpo())])),
('pipeline-3',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcmacd',
CalcMacd())])),
('pipeline-4',
Pipeline(steps=[('featureselector',
FeatureSelector(feature_names='price')),
('calcshift',
CalcShift(shift_val=-13)),
('calcreturn',
CalcReturn())]))])FeatureSelector(feature_names='price')
CalcShift(shift_val=1)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=13)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=26)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=39)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=52)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=65)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=78)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=91)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=104)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=117)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=130)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=13)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=26)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=39)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=52)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=65)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=78)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=91)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=104)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=117)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=130)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=13)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=26)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=39)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=52)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=65)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=78)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=91)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=104)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=117)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=130)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=13, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=26, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=39, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=52, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=65, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=78, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=91, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=104, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=117, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=130, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=13, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=26, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=39, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=52, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=65, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=78, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=91, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=104, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=117, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=130, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=13, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=26, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=39, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=52, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=65, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=78, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=91, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=104, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=117, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=130, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=13)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=26)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=39)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=52)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=65)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=78)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=91)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=104)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=117)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=130)
FeatureSelector(feature_names='price')
CalcRsi(period=13)
FeatureSelector(feature_names='price')
CalcRsi(period=26)
FeatureSelector(feature_names='price')
CalcRsi(period=39)
FeatureSelector(feature_names='price')
CalcRsi(period=52)
FeatureSelector(feature_names='price')
CalcRsi(period=65)
FeatureSelector(feature_names='price')
CalcRsi(period=78)
FeatureSelector(feature_names='price')
CalcRsi(period=91)
FeatureSelector(feature_names='price')
CalcRsi(period=104)
FeatureSelector(feature_names='price')
CalcRsi(period=117)
FeatureSelector(feature_names='price')
CalcRsi(period=130)
FeatureSelector(feature_names='price')
CalcPpo()
FeatureSelector(feature_names='price')
CalcMacd()
FeatureSelector(feature_names='price')
CalcShift(shift_val=-13)
CalcReturn()
# Materialise the feature matrix; the final column is the forward-return target.
data = feature_union.fit_transform(price_execution_data_df)
# Rolling windows and shifts leave leading/trailing NaNs -- drop those rows.
data = data.dropna()
# Chronological 80/20 split (no shuffling) to avoid look-ahead leakage.
train_size = 0.8
test_size = 1-train_size
data_size = data.shape[0]
data_train = data.iloc[:int(data_size*train_size)]
data_test = data.iloc[int(data_size*train_size):]
target_names = [f"returns_forward_{time_lag}"]
# Assumes the target is the LAST column emitted by feature_union -- TODO confirm.
feature_names = data.columns[:-1]
X_train, y_train = data_train[feature_names], data_train[target_names]
X_test, y_test = data_test[feature_names], data_test[target_names]
X_train.shape
(15895, 83)
X_test.shape
(3974, 83)
# Time Series Cross Validation
def time_series_cross_validation(X: pd.DataFrame,
                                 y: pd.DataFrame,
                                 init_train_size: int,
                                 val_cv: int,
                                 model: callable,
                                 params: Dict):
    """Walk-forward (expanding-window) cross validation.

    Parameters
    ----------
    X : feature data
    y : target data
    init_train_size : size of the first training window
    val_cv : observations per validation fold (also the step between folds)
    model : regressor class, e.g. ``lgb.LGBMRegressor``
    params : keyword arguments used to construct the regressor

    Returns
    -------
    dict of per-fold lists: train/val IC, train/val RMSE, and the train size.
    """
    scores = {"train_ic": [],
              "val_ic": [],
              "sample_size": [],
              "train_rmse": [],
              "val_rmse": []}
    regressor = model(**params)
    # range() already advances by val_cv; the old manual ``sample_size += val_cv``
    # at the end of the loop body was dead code and has been removed.
    for sample_size in range(init_train_size, X.shape[0], val_cv):
        X_train = X.iloc[:sample_size]
        y_train = y.iloc[:sample_size]
        X_val = X.iloc[sample_size:sample_size + val_cv]
        y_val = y.iloc[sample_size:sample_size + val_cv]
        regressor.fit(X_train, y_train)
        y_train_pred = regressor.predict(X_train)
        y_val_pred = regressor.predict(X_val)
        scores["train_ic"].append(ic_metric(y_train, y_train_pred))
        scores["val_ic"].append(ic_metric(y_val, y_val_pred))
        # BUG FIX: the keys say RMSE but mean_squared_error returns the MSE --
        # take the square root so the stored values match their name.
        scores["train_rmse"].append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        scores["val_rmse"].append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
        scores["sample_size"].append(sample_size)
    return scores
# Setup objective function
def objective(params):
    """Hyperopt objective: negated mean validation IC (fmin minimises).

    ``params`` bundles the fixed CV configuration under ``"cv_config"`` and
    the sampled hyperparameters under ``"space"``.
    (Indentation restored -- the notebook export had flattened the body.)
    """
    cv_config = params["cv_config"]
    X, y, init_train_size, val_cv, model = (cv_config.get("X"),
                                            cv_config.get("y"),
                                            cv_config.get("init_train_size"),
                                            cv_config.get("val_cv"),
                                            cv_config.get("model"))
    scores = time_series_cross_validation(X,
                                          y,
                                          init_train_size,
                                          val_cv,
                                          model,
                                          params["space"])
    # Maximise mean out-of-sample IC by minimising its negation.
    return -np.mean(scores["val_ic"])
# CV Config -- fixed inputs passed through to time_series_cross_validation.
cv_config = {"X": X_train,
"y": y_train,
"init_train_size": 10000,
"val_cv": 250,
"model": lgb.LGBMRegressor}
# Uninformative Priors -- wide uniform ranges; TPE narrows them over trials.
# scope.int() rounds the sampled floats for integer-valued LightGBM params.
space = {
'learning_rate': hp.uniform('learning_rate', 0.01, 2),
'num_leaves': scope.int(hp.uniform('num_leaves', 2, 20)),
'max_depth': scope.int(hp.uniform('max_depth', 1, 20)),
'subsample': hp.uniform('subsample', 0.6, 1),
'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1), #feature_fraction
'num_iterations': scope.int(hp.uniform('num_iterations',5, 400)),
# 'reg_alpha': hp.choice("reg_alpha", np.arange(0, 2, 0.1)), #L1 can shrink most of the features to zero, which will produce straight line pred
'reg_lambda': hp.uniform("reg_lambda",0, 5), #L2
'seed': SEED
}
params = {"cv_config":cv_config,
"space": space}
# Trials records every evaluation for the post-hoc analysis below.
trials = Trials()
best = fmin(fn=objective,
space=params,
algo=tpe.suggest,
max_evals=250,
trials=trials,
rstate=np.random.default_rng(SEED))
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [39:09<00:00, 9.40s/trial, best loss: -0.03382063418707294]
best
{'colsample_bytree': 0.9073245190249477,
'learning_rate': 0.8362765112862323,
'max_depth': 5.795458069323013,
'num_iterations': 154.30736269988873,
'num_leaves': 11.409479385608066,
'reg_lambda': 2.8689492539762513,
'subsample': 0.8105926533316665}
def unpack(x):
    """Return the single value stored in a hyperopt ``vals`` list.

    hyperopt records each trial parameter as a 0/1-element list; an empty
    list means the parameter was not sampled, reported here as NaN.
    """
    return x[0] if x else np.nan
# One row per trial: sampled hyperparameter values plus the achieved loss.
trials_df = pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack) for t in trials])
trials_df["loss"] = [t["result"]["loss"] for t in trials]
trials_df["trial_number"] = trials_df.index
# Conceptually-integer hyperparameters that hyperopt sampled as floats.
int_col_names = ["max_depth", "num_iterations", "num_leaves"]
# (Loop indentation restored -- flattened by the notebook export.)
for col_name in int_col_names:
    trials_df[col_name] = trials_df[col_name].apply(round)
trials_df["corr"] = trials_df["loss"].apply(lambda x: -x)  # the hyperopt goal is to minimize -corr
trials_df.sort_values(by="corr", ascending=False)
| colsample_bytree | learning_rate | max_depth | num_iterations | num_leaves | reg_lambda | subsample | loss | trial_number | corr | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 0.907325 | 0.836277 | 6 | 154 | 11 | 2.868949 | 0.810593 | -0.033821 | 2 | 0.033821 |
| 56 | 0.827558 | 0.884786 | 7 | 344 | 16 | 2.498218 | 0.905005 | -0.031934 | 56 | 0.031934 |
| 113 | 0.897715 | 0.827892 | 15 | 175 | 19 | 3.895779 | 0.995305 | -0.026505 | 113 | 0.026505 |
| 89 | 0.732165 | 0.549636 | 19 | 268 | 14 | 0.883844 | 0.744865 | -0.026131 | 89 | 0.026131 |
| 130 | 0.915108 | 1.372797 | 17 | 161 | 18 | 3.946307 | 0.905825 | -0.024781 | 130 | 0.024781 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39 | 0.818831 | 1.259677 | 17 | 49 | 14 | 3.664916 | 0.812888 | 0.021504 | 39 | -0.021504 |
| 232 | 0.836154 | 0.710190 | 6 | 302 | 13 | 4.280225 | 0.844149 | 0.024907 | 232 | -0.024907 |
| 181 | 0.905013 | 0.761663 | 12 | 262 | 11 | 3.727888 | 0.815731 | 0.026336 | 181 | -0.026336 |
| 216 | 0.927756 | 0.826142 | 11 | 231 | 19 | 3.412954 | 0.719116 | 0.029589 | 216 | -0.029589 |
| 45 | 0.600650 | 1.055966 | 18 | 118 | 7 | 2.182601 | 0.656977 | 0.033866 | 45 | -0.033866 |
250 rows × 10 columns
# Save trials data
path = "/Users/efim/PycharmProjects/SimpleAlgoTrade/model/data/"
trials_df.to_csv(path+"light_gbm_trials.csv")
# Search progress: does corr keep improving across trials, or plateau?
px.scatter(trials_df, x="trial_number", y="corr")
# Contour of achieved corr over the (num_iterations, num_leaves) plane --
# visual check for interactions between the two capacity parameters.
fig = go.Figure(
data=go.Contour(
z=trials_df["corr"],
x=trials_df["num_iterations"],
y=trials_df["num_leaves"],
contours=dict(
showlabels=True, # show labels on contours
labelfont=dict(size=12, color="white",), # label font properties
),
colorbar=dict(title="corr", titleside="right",),
hovertemplate="corr: %{z}<br>num_iterations: %{x}<br>num_leaves: %{y}<extra></extra>",
)
)
fig.update_layout(
xaxis_title="num_iterations",
yaxis_title="num_leaves",
title={
"text": "num_iterations vs. num_leaves",
"xanchor": "center",
"yanchor": "top",
"x": 0.5,
},
margin=dict(l=10, r=10, t=10, b=10)
)
learning_rate vs num_iterations
# Same contour diagnostic for (num_iterations, learning_rate) -- the classic
# boosting trade-off between step size and number of rounds.
fig = go.Figure(
data=go.Contour(
z=trials_df["corr"],
x=trials_df["num_iterations"],
y=trials_df["learning_rate"],
contours=dict(
showlabels=True, # show labels on contours
labelfont=dict(size=12, color="white",), # label font properties
),
colorbar=dict(title="corr", titleside="right",),
hovertemplate="corr: %{z}<br>num_iterations: %{x}<br>learning_rate: %{y}<extra></extra>",
)
)
fig.update_layout(
xaxis_title="num_iterations",
yaxis_title="learning_rate",
title={
"text": "num_iterations vs. learning_rate",
"xanchor": "center",
"yanchor": "top",
"x": 0.5,
},
margin=dict(l=10, r=10, t=10, b=10)
)
def map_to_int(params, int_keys=("max_depth", "num_iterations", "num_leaves")):
    """Round hyperopt's float-valued integer hyperparameters back to ints.

    hyperopt samples every parameter as a float, but LightGBM requires
    these to be integers.  ``int_keys`` defaults to the same columns as the
    module-level ``int_col_names``, so existing call sites behave unchanged;
    passing a different tuple generalises the helper to other models.
    """
    return {key: round(val) if key in int_keys else val
            for key, val in params.items()}
# hyperopt's `best` dict holds floats even for integer params -- fix them up.
best_params = map_to_int(best)
print(best_params)
{'colsample_bytree': 0.9073245190249477, 'learning_rate': 0.8362765112862323, 'max_depth': 6, 'num_iterations': 154, 'num_leaves': 11, 'reg_lambda': 2.8689492539762513, 'subsample': 0.8105926533316665}
# Re-run walk-forward CV with the tuned hyperparameters to inspect fold-to-fold
# stability of the IC (a single mean can hide regime dependence).
scores = time_series_cross_validation(X_train,
y_train,
cv_config["init_train_size"],
cv_config["val_cv"],
cv_config["model"],
best_params)
plt.plot(scores["train_ic"], label="Train IC")
plt.plot(scores["val_ic"], label="Val IC")
plt.legend()
<matplotlib.legend.Legend at 0x7fdfb2ef3610>
np.mean(scores["val_ic"])
0.009715709631665424
# Fit the final model on the full training window with the tuned params.
model = lgb.LGBMRegressor(**best_params)
model.fit(X_train, y_train)
LGBMRegressor(colsample_bytree=0.9073245190249477,
learning_rate=0.8362765112862323, max_depth=6, num_iterations=154,
num_leaves=11, reg_lambda=2.8689492539762513,
subsample=0.8105926533316665)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. LGBMRegressor(colsample_bytree=0.9073245190249477,
learning_rate=0.8362765112862323, max_depth=6, num_iterations=154,
num_leaves=11, reg_lambda=2.8689492539762513,
subsample=0.8105926533316665)top = 50
# Feature importances, largest first.
# NOTE(review): ``top`` (= 50) is set in the previous cell (its assignment got
# fused into the repr output in this export) -- confirm it is in scope.
df_feat = (
pd.DataFrame({"feat_importance": model.feature_importances_,
"feature_name": X_train.columns})
.sort_values(by="feat_importance", ascending=False)
)
df_feat.head(top).plot.bar(x="feature_name", y ="feat_importance")
plt.title(f"Top {top} feature imporance")
Text(0.5, 1.0, 'Top 50 feature imporance')
# Linear sensitivity of the achieved corr to each hyperparameter -- a rough
# screen only: interactions and non-linearities are not modelled here.
ols = OLS(endog=trials_df['corr'], exog=add_constant(trials_df.drop(['loss','trial_number','corr'], axis=1))).fit()
print(ols.summary())
OLS Regression Results
==============================================================================
Dep. Variable: corr R-squared: 0.045
Model: OLS Adj. R-squared: 0.018
Method: Least Squares F-statistic: 1.636
Date: Fri, 16 Sep 2022 Prob (F-statistic): 0.126
Time: 11:27:17 Log-Likelihood: 768.12
No. Observations: 250 AIC: -1520.
Df Residuals: 242 BIC: -1492.
Df Model: 7
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const -0.0052 0.009 -0.584 0.560 -0.023 0.012
colsample_bytree -0.0066 0.008 -0.841 0.401 -0.022 0.009
learning_rate -0.0032 0.002 -1.805 0.072 -0.007 0.000
max_depth 9.801e-05 0.000 0.699 0.485 -0.000 0.000
num_iterations -1.569e-06 7.91e-06 -0.198 0.843 -1.72e-05 1.4e-05
num_leaves -0.0001 0.000 -0.613 0.540 -0.000 0.000
reg_lambda 0.0005 0.001 0.899 0.369 -0.001 0.002
subsample 0.0192 0.007 2.602 0.010 0.005 0.034
==============================================================================
Omnibus: 1.314 Durbin-Watson: 1.965
Prob(Omnibus): 0.518 Jarque-Bera (JB): 1.024
Skew: -0.132 Prob(JB): 0.599
Kurtosis: 3.170 Cond. No. 3.60e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.6e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Compact view: coefficient point estimates with their 95% confidence bounds.
pd.concat([ols.params.to_frame('coef'),
ols.conf_int().rename({0:"lower",1:"upper"},axis=1),
], axis=1)
| coef | lower | upper | |
|---|---|---|---|
| const | -0.005240 | -0.022913 | 0.012432 |
| colsample_bytree | -0.006641 | -0.022202 | 0.008920 |
| learning_rate | -0.003170 | -0.006630 | 0.000289 |
| max_depth | 0.000098 | -0.000178 | 0.000374 |
| num_iterations | -0.000002 | -0.000017 | 0.000014 |
| num_leaves | -0.000106 | -0.000445 | 0.000234 |
| reg_lambda | 0.000526 | -0.000627 | 0.001680 |
| subsample | 0.019159 | 0.004653 | 0.033665 |
Note: Correlations between the features are ignored.
# Partial-dependence plots for the most important features, one panel each.
num_feat = 10
fig, ax = plt.subplots(num_feat, 1, figsize=(10, 90))  # was hard-coded 10; tied to num_feat
for i in range(num_feat):
    # BUG FIX: df_feat keeps its pre-sort integer index, so the label-based
    # df_feat["feature_name"][i] looked up the i-th *unsorted* row, not the
    # i-th most important feature; .iloc selects by position after the sort.
    feat = df_feat["feature_name"].iloc[i]
    PartialDependenceDisplay.from_estimator(model, X_train, [feat], kind='both', ax=ax[i])
# In-sample fit: predicted vs realised forward returns on the training window.
plt.plot(y_train.values, label = "actual")
plt.plot(model.predict(X_train), label ="pred")
plt.title("Train Set")
plt.legend()
<matplotlib.legend.Legend at 0x7fdfa49b49a0>
# Out-of-sample fit: the honest version of the plot above.
plt.plot(y_test.values, label = "actual")
plt.plot(model.predict(X_test), label ="pred")
plt.title("Test Set")
plt.legend()
<matplotlib.legend.Legend at 0x7fdfb4a71d50>
# Reconstruct predicted *prices* from predicted returns for sanity plots.
y_train_pred = pd.Series(model.predict(X_train), index = X_train.index, name=f"train_return_pred_{time_lag}")
# Realised price time_lag ticks ahead (what the model tries to anticipate).
price_execution_data_df[f"price_shift_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag)
# NOTE(review): this is the 1-tick pct change of the shifted series, not the
# 13-tick return of the raw price -- it matches returns_forward_13 in the
# preview so it appears intentional, but worth confirming.
price_execution_data_df[f"price_pct_change_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag).pct_change()
# Base price each forward return is applied to: the shifted price, lagged 1 tick.
price_execution_data_df["multiplicator"] = price_execution_data_df[f"price_shift_{time_lag}"].shift(1)
full_data_train = price_execution_data_df.merge(y_train, left_index=True, right_index=True)
full_data_train = full_data_train.merge(y_train_pred, left_index=True, right_index=True)
full_data_train.head(10)
| symbol | price | delta_time | price_shift_13 | price_pct_change_13 | multiplicator | returns_forward_13 | train_return_pred_13 | |
|---|---|---|---|---|---|---|---|---|
| time | ||||||||
| 1.660649e+09 | BTCUSDT | 24078.99 | 0.613602 | 24078.93 | 0.000155 | 24075.19 | 0.000155 | 0.000026 |
| 1.660649e+09 | BTCUSDT | 24080.96 | 0.613479 | 24077.98 | -0.000039 | 24078.93 | -0.000039 | -0.000017 |
| 1.660649e+09 | BTCUSDT | 24079.51 | 1.428748 | 24078.31 | 0.000014 | 24077.98 | 0.000014 | -0.000016 |
| 1.660649e+09 | BTCUSDT | 24078.57 | 0.614810 | 24078.82 | 0.000021 | 24078.31 | 0.000021 | -0.000064 |
| 1.660649e+09 | BTCUSDT | 24079.09 | 0.613618 | 24078.31 | -0.000021 | 24078.82 | -0.000021 | -0.000039 |
| 1.660649e+09 | BTCUSDT | 24079.84 | 0.613721 | 24078.74 | 0.000018 | 24078.31 | 0.000018 | 0.000016 |
| 1.660649e+09 | BTCUSDT | 24078.14 | 0.613349 | 24076.06 | -0.000111 | 24078.74 | -0.000111 | -0.000043 |
| 1.660649e+09 | BTCUSDT | 24078.53 | 1.432766 | 24074.90 | -0.000048 | 24076.06 | -0.000048 | -0.000018 |
| 1.660649e+09 | BTCUSDT | 24075.34 | 1.334504 | 24078.35 | 0.000143 | 24074.90 | 0.000143 | 0.000011 |
| 1.660649e+09 | BTCUSDT | 24073.91 | 0.612562 | 24078.91 | 0.000023 | 24078.35 | 0.000023 | -0.000031 |
full_data_train.shape
(15895, 8)
# Sanity check: multiplicator * (1 + forward return) must reproduce the
# realised forward price. BUG FIX: compare the *absolute* difference --
# without abs(), any large negative mismatch would silently pass the check.
assert ((full_data_train["multiplicator"]*(1+full_data_train[f"returns_forward_{time_lag}"]) - full_data_train[f"price_shift_{time_lag}"]).abs() < 1e-6).all(), "Oops Prices are not alligned"
# Predicted price = base price grown by the *predicted* return.
full_data_train[f"train_price_pred_{time_lag}"] = full_data_train["multiplicator"]*(1+full_data_train[f"train_return_pred_{time_lag}"])
full_data_train[f"train_price_pred_{time_lag}"].iloc[:100].plot(label="y_pred")
full_data_train[f"price_shift_{time_lag}"].iloc[:100].plot(label="y_true")
plt.legend()
<matplotlib.legend.Legend at 0x7fdfa51fdfc0>
# Same price reconstruction for the held-out window.
y_test_pred = pd.Series(model.predict(X_test), index = X_test.index, name=f"test_return_pred_{time_lag}")
price_execution_data_df[f"price_shift_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag)
price_execution_data_df[f"price_pct_change_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag).pct_change()
price_execution_data_df["multiplicator"] = price_execution_data_df[f"price_shift_{time_lag}"].shift(1)
full_data_test = price_execution_data_df.merge(y_test, left_index=True, right_index=True)
full_data_test = full_data_test.merge(y_test_pred, left_index=True, right_index=True)
# The last time_lag rows have no realised forward price (NaN) -- drop them.
full_data_test.dropna(inplace=True)
full_data_test.head(10)
| symbol | price | delta_time | price_shift_13 | price_pct_change_13 | multiplicator | returns_forward_13 | test_return_pred_13 | |
|---|---|---|---|---|---|---|---|---|
| time | ||||||||
| 1.660661e+09 | BTCUSDT | 23757.88 | 0.656786 | 23763.33 | 0.000023 | 23762.78 | 0.000023 | 0.000015 |
| 1.660661e+09 | BTCUSDT | 23756.04 | 0.613033 | 23764.46 | 0.000048 | 23763.33 | 0.000048 | -0.000004 |
| 1.660661e+09 | BTCUSDT | 23759.18 | 0.612939 | 23762.64 | -0.000077 | 23764.46 | -0.000077 | 0.000031 |
| 1.660661e+09 | BTCUSDT | 23759.27 | 0.612498 | 23757.75 | -0.000206 | 23762.64 | -0.000206 | 0.000030 |
| 1.660661e+09 | BTCUSDT | 23759.75 | 1.330198 | 23758.22 | 0.000020 | 23757.75 | 0.000020 | 0.000073 |
| 1.660661e+09 | BTCUSDT | 23760.90 | 0.715904 | 23759.08 | 0.000036 | 23758.22 | 0.000036 | 0.000044 |
| 1.660661e+09 | BTCUSDT | 23765.09 | 0.614189 | 23760.25 | 0.000049 | 23759.08 | 0.000049 | -0.000013 |
| 1.660661e+09 | BTCUSDT | 23766.28 | 0.612893 | 23759.44 | -0.000034 | 23760.25 | -0.000034 | -0.000024 |
| 1.660661e+09 | BTCUSDT | 23767.16 | 0.576914 | 23757.04 | -0.000101 | 23759.44 | -0.000101 | 0.000011 |
| 1.660661e+09 | BTCUSDT | 23765.95 | 0.650788 | 23756.31 | -0.000031 | 23757.04 | -0.000031 | -0.000006 |
# Same sanity check on the held-out window. BUG FIX: use the absolute
# difference -- the signed comparison passed on large negative mismatches.
assert ((full_data_test["multiplicator"]*(1+full_data_test[f"returns_forward_{time_lag}"]) - full_data_test[f"price_shift_{time_lag}"]).abs() < 1e-6).all(), "Oops Prices are not alligned"
full_data_test[f"test_price_pred_{time_lag}"] = full_data_test["multiplicator"]*(1+full_data_test[f"test_return_pred_{time_lag}"])
full_data_test[f"test_price_pred_{time_lag}"].iloc[:100].plot(label="y_pred")
full_data_test[f"price_shift_{time_lag}"].iloc[:100].plot(label="y_true")
plt.legend()
<matplotlib.legend.Legend at 0x7fdfa522c8e0>
# # Train Data
path = "/Users/efim/PycharmProjects/SimpleAlgoTrade/model/data/"
# full_data_train[["price","price_shift_13","train_price_pred_13"]].to_csv(path+"train_pred.csv")
# # Test Data
# full_data_test[["price","price_shift_13","test_price_pred_13"]].to_csv(path+"test_pred.csv")
# Export predicted returns alongside the concurrent price for downstream use.
# NOTE(review): the "13" in the column names is hard-coded rather than derived
# from time_lag -- these will mislabel the data if the horizon changes.
y_train_pred = pd.merge(pd.Series(model.predict(X_train), index = X_train.index, name="train_return_pred_13"),
price_execution_data_df["price"],
left_index=True,
right_index=True)
y_test_pred = pd.merge(pd.Series(model.predict(X_test), index = X_test.index, name="test_return_pred_13"),
price_execution_data_df["price"],
left_index=True,
right_index=True)
y_train_pred.to_csv(path+"train_pred.csv")
y_test_pred.to_csv(path+"test_pred.csv")